This R markdown file summarizes Simulation Study results.
rm(list=ls()) # clean up workspace
setwd("/Users/xji3/GitFolders/YeastIGCTract/SimulationStudy/")
Tract.list <- c(3.0, 10.0, 50.0, 100.0, 200.0, 300.0, 400.0, 500.0)
# First read in HMM results
# from summary file
for(tract in Tract.list){
hmm.tract.summary <- NULL
for(sim in 1:100){
hmm.summary <- paste("./summary/Tract_", toString(tract), '.0/sim_',
toString(sim), '/HMM_YDR418W_YEL054C_MG94_nonclock_sim_',
toString(sim), '_1D_summary.txt', sep = "")
if (file.exists(hmm.summary)){
all <- readLines(hmm.summary, n = -1)
col.names <- paste("sim_", toString(sim), sep = "")
row.names <- strsplit(all[length(all)], ' ')[[1]][-1]
summary_mat <- as.matrix(read.table(hmm.summary,
row.names = row.names,
col.names = col.names))
hmm.tract.summary <- cbind(hmm.tract.summary, summary_mat)
}
}
assign(paste("HMM_Tract_", toString(tract), "_summary", sep = ""), hmm.tract.summary)
}
# from plots
for(tract in Tract.list){
hmm.tract.plots <- NULL
for(sim in 1:100){
hmm.plot <- paste("./plot/Tract_", toString(tract), '.0/sim_',
toString(sim), '/HMM_YDR418W_YEL054C_lnL_sim_',
toString(sim), '_1D_surface.txt', sep = "")
if (file.exists(hmm.plot)){
lnL.surface <- read.table(hmm.plot)
max.idx <- which.max(lnL.surface[, 2])
new.summary <- matrix(c(3.0*exp(-lnL.surface[max.idx, 1]), lnL.surface[max.idx, 2]), 2, 1)
rownames(new.summary) <- c("tract in nt", "lnL")
colnames(new.summary) <- paste("sim_", toString(sim), sep = "")
hmm.tract.plots <- cbind(hmm.tract.plots, new.summary)
}
}
assign(paste("HMM_Tract_", toString(tract), "_plot", sep = ""), hmm.tract.plots)
}
# Now read in actual mean tract length in each simulated dataset
for (tract in Tract.list){
sim.tract <- NULL
for(sim in 1:100){
sim_log <- paste("./Tract_", toString(tract), ".0_HKY/sim_", toString(sim),
"/YDR418W_YEL054C_sim_", toString(sim), "_IGC.log", sep = "")
# now read in log file
log_info <- read.table(sim_log, header = TRUE)
performed.tract.length <- log_info[, "stop_pos"] - log_info[, "start_pos"] + 1
proposed.tract.length <- log_info[, "tract_length"]
diff.tracts <- log_info[, "num_diff"] > 0
new.info <- matrix(c(dim(log_info)[1], mean(proposed.tract.length), sd(proposed.tract.length),
mean(performed.tract.length), sd(performed.tract.length),
mean(proposed.tract.length[diff.tracts]),
mean(performed.tract.length[diff.tracts])), 7, 1)
rownames(new.info) <- c("num IGC", "mean proposed tract length", "sd proposed tract length",
"mean performed tract length", "sd performed tract length",
"mean proposed nonidentical tract length", "mean performed nonidentical tract length")
colnames(new.info) <- paste("sim_", toString(sim), sep = "")
sim.tract <- cbind(sim.tract, new.info)
}
assign(paste("sim.tract.", toString(tract), sep = ""), sim.tract)
}
guess.list <- c(50.0, 100.0, 250.0, 500.0)
# Now read in PSJS summary results
for(tract in Tract.list){
for(guess in guess.list){
PSJS.tract.summary <- NULL
for(sim in 1:100){
PSJS.summary <- paste("./summary/Tract_", toString(tract), '.0_HKY/sim_',
toString(sim), '/PSJS_HKY_rv_sim_',
toString(sim), "_Tract_", toString(tract), '.0_guess_',
toString(guess),'.0_nt_summary.txt', sep = "")
if (file.exists(PSJS.summary)){
all <- readLines(PSJS.summary, n = -1)
col.names <- paste("sim_", toString(sim), sep = "")
row.names <- strsplit(all[length(all)], ' ')[[1]][-1]
summary_mat <- as.matrix(read.table(PSJS.summary,
row.names = row.names,
col.names = col.names))
PSJS.tract.summary <- cbind(PSJS.tract.summary, summary_mat)
}
}
assign(paste("PSJS_HKY_Tract_", toString(tract), "_guess_",
toString(guess), "_summary", sep = ""), PSJS.tract.summary)
}
}
# Now combine all initial guess results
for(tract in Tract.list){
combined.PSJS.tract.summary <- NULL
col.list <- NULL
for ( sim_num in 1:100){
sim_col <- paste("sim_", toString(sim_num), sep = "")
best.lnL <- -Inf
best.guess <- NULL
for(guess in guess.list){
target_summary <- get(paste("PSJS_HKY_Tract_", toString(tract), "_guess_", toString(guess), "_summary", sep = ""))
if(sim_col %in% colnames(target_summary) ){
if (target_summary["ll", sim_col] > best.lnL){
best.lnL <- target_summary["ll", sim_col]
best.guess <- guess
}
}
}
if(! is.null(best.guess)){
combined.PSJS.tract.summary <- cbind(combined.PSJS.tract.summary,
get(paste("PSJS_HKY_Tract_", toString(tract), "_guess_", toString(best.guess), "_summary", sep = ""))[, sim_col])
col.list <- c(col.list, sim_col)
}
}
colnames(combined.PSJS.tract.summary) <- col.list
assign(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""), combined.PSJS.tract.summary)
}
OK, Now show the performance summary
# # HMM results
# for (tract in Tract.list){
# # show how many stuck at boundary 1000 nt first
# print(paste("Tract = ", toString(tract), sep = ""))
# summary_mat <- get(paste("HMM_Tract_", toString(tract), "_plot", sep = ""))
#
# # histogram of inferred tract length
# hist(summary_mat[1, summary_mat[1, ] < 999.], main = paste("Tract = ", toString(tract), sep = ""))
# print(paste("Among total 100 simulated data sets, ", toString(sum(summary_mat[1, ] > 999)),
# " datasets stuck at 1000", sep = ""))
# print(matrix(c("mean", mean(summary_mat[1, summary_mat[1, ] < 999.]),
# "sd", sd(summary_mat[1, summary_mat[1, ] < 999.])), 2, 2))
# }
#
# # PSJS results
# for(tract in Tract.list){
# summary_mat <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
# # histogram of inferred tract length
# hist(summary_mat["tract_length", ], main = paste("Tract = ", toString(tract), sep = ""))
# print(matrix(c("mean", mean(summary_mat["tract_length", ]),
# "sd", sd(summary_mat["tract_length", ])), 2, 2))
# }
#
# # exclude suspiciously long inferred tract length, plot again
# for(tract in Tract.list){
# summary_mat <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
# # histogram of inferred tract length
# hist(summary_mat["tract_length", summary_mat["tract_length", ] < 1000.0], main = paste("Tract = ", toString(tract), sep = ""))
# print(matrix(c("mean", mean(summary_mat["tract_length", summary_mat["tract_length", ] < 1000.0]),
# "sd", sd(summary_mat["tract_length", summary_mat["tract_length", ] < 1000.0])), 2, 2))
# }
A plot of HMM surface that infer tract length at boundary from each tract length condition
# # plot the first two datasets that HMM inferred tract length stuck at boundary of 1000.0
# for (tract in Tract.list){
# print(paste("Tract = ", toString(tract), sep = ""))
# summary_mat <- get(paste("HMM_Tract_", toString(tract), "_plot", sep = ""))
# count = 0
# for(iter in 1:99){
# if(summary_mat[1, iter] > 999. & count < 2){
# to.plot <- read.table(paste("./plot/Tract_", toString(tract), ".0/", colnames(summary_mat)[iter],
# "/HMM_YDR418W_YEL054C_lnL_", colnames(summary_mat)[iter],
# "_1D_surface.txt", sep = ""))
# plot(3.0*exp(-to.plot[, 1]), to.plot[, 2], main = paste("Tract = ", toString(tract), " ",colnames(summary_mat)[iter], sep = "" ), type = "l", xlab = "tract length in nucleotides",
# ylab = "lnL")
# count = count + 1
# }
# }
# }
Now plot the two plots of lnL
# library(ggplot2)
# # Tract length = 10
# hmm.plot <- read.table("./plot/Tract_10.0/sim_1/HMM_YDR418W_YEL054C_lnL_sim_1_1D_surface.txt")
# PSJS.plot <- read.table("./plot/Tract_10.0/sim_1/PSJS_HKY_rv_sim_1_Tract_10.0_lnL_1D_surface.txt")
# ggplot(mapping = aes(x = 3.0*exp(-hmm.plot[,1]), y = hmm.plot[, 2], colour = "blue")) + geom_line() +
# geom_line(aes(x = exp(-PSJS.plot[,1]), y = PSJS.plot[, 2]/488 + min(hmm.plot[, 2]) - min(PSJS.plot[, 2]/488),
# colour = "red")) +
# xlab("Tract length in nucleotides") +
# ylab("lnL") +
# ggtitle("Tract length = 10, simulated dataet 1")
#
# hmm.plot <- read.table("./plot/Tract_10.0/sim_100/HMM_YDR418W_YEL054C_lnL_sim_100_1D_surface.txt")
# PSJS.plot <- read.table("./plot/Tract_10.0/sim_100/PSJS_HKY_rv_sim_100_Tract_10.0_lnL_1D_surface.txt")
# ggplot(mapping = aes(x = 3.0*exp(-hmm.plot[,1]), y = hmm.plot[, 2], colour = "blue")) + geom_line() +
# geom_line(aes(x = exp(-PSJS.plot[,1]), y = PSJS.plot[, 2]/488 + min(hmm.plot[, 2]) - min(PSJS.plot[, 2]/488),
# colour = "red")) +
# xlab("Tract length in nucleotides") +
# ylab("lnL") +
# ggtitle("Tract length = 10, simulated dataet 100")
Now see how estimates from the two approaches differ from the actual mean tract length in each simulated data set.
# for(tract in Tract.list){
# sim.info <- get(paste("sim.tract.", toString(tract), sep = ""))
# # Show mean and sd
# print(matrix(c("proposed mean", mean(sim.info["mean proposed tract length", ]),
# "performed mean", mean(sim.info["mean performed tract length", ]),
# "geometric mean", tract,
# "proposed sd", mean(sim.info["sd proposed tract length",], na.rm = TRUE),
# "performed sd", mean(sim.info["sd performed tract length",], na.rm = TRUE),
# "geometric sd", sqrt(tract^2-tract*3.0)), 2, 6))
# hmm.info <- get(paste("HMM_Tract_", toString(tract), "_plot", sep = ""))
# PSJS.info <- get(paste("PSJS_Tract_", toString(tract), "_summary", sep = ""))
# shared.col <- intersect(colnames(hmm.info), colnames(PSJS.info))
#
# # Now show the ratio of HMM estimated tract / actual mean tracts in simulation
# hmm.ratio <- hmm.info[1, shared.col]/sim.info[1, shared.col]
# hist(hmm.ratio, main = paste("HMM ratio Tract = ", toString(tract), sep = ""))
# print(matrix(c("HMM mean", mean(hmm.ratio), "HMM sd", sd(hmm.ratio)), 2, 2))
#
# # Now show the ratio of PSJS estimated tract / actual mean tracts in simulation
# PSJS.ratio <- PSJS.info["tract_length", shared.col]/sim.info[1, shared.col]
# hist(PSJS.ratio, main = paste("PSJS ratio Tract = ", toString(tract), sep = ""))
# print(matrix(c("PSJS mean", mean(PSJS.ratio), "PSJS sd", sd(PSJS.ratio)), 2, 2))
#
# }
Show the PSJS estimated results for Simulated datasets with estimated tau
for(tract in Tract.list){
target_summary <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
col.names <- target_summary["tract_length", ] < 10*tract
hist(target_summary["tract_length", col.names],
main = paste("Tract = ", toString(tract), ".0 combined PSJS HKY Results", sep = ""))
sim_info <- get(paste("sim.tract.", toString(tract), sep = ""))
plot(sim_info["num IGC", ], sim_info["mean proposed tract length", ],
main = paste("Simulation info of Tract ", toString(tract), sep = ""),
xlab = "number of IGC events", ylab = "mean proposed tract length")
abline(h = tract, col = "red")
plot(sim_info["num IGC", ], sim_info["mean performed tract length", ],
main = paste("Simulation Info of Tract ", toString(tract), sep = ""),
xlab = "number of IGC events", ylab = "mean performed tract length")
abline(h = tract, col = "red")
plot(sim_info["num IGC", colnames(target_summary)[col.names]], target_summary["tract_length", col.names],
main = paste("PSJS estimate of Tract ", toString(tract), sep = ""),
xlab = "number of IGC events", ylab = "PSJS estimated tract length")
abline(h = tract, col = "red")
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["tract_length", col.names] / sim_info["mean proposed tract length", colnames(target_summary)[col.names]],
main = paste(" Ratio of PSJS tract length over mean proposed tract length - Tract ", toString(tract), sep = ""),
xlab = "number of IGC events", ylab = "Ratio (PSJS/mean proposed tract length)")
abline(h = 1.0, col = "red")
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["tract_length", col.names] / sim_info["mean performed tract length", colnames(target_summary)[col.names]],
main = paste(" Ratio of PSJS tract length over mean performed tract length - Tract ", toString(tract), sep = ""),
xlab = "number of IGC events", ylab = "Ratio (PSJS/mean performed tract length)")
abline(h = 1.0, col = "red")
# Now plot kappa estimates
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["kappa", col.names],
main = paste("PSJS kappa, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS estimated kappa")
abline(h = 14.46399, col = 2)
# Now plot r2 estimates
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["r2", col.names],
main = paste("PSJS r2, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS estimated r2")
abline(h = 0.5391702, col = 2)
# Now plot r3 estimates
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["r3", col.names],
main = paste("PSJS r3, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS estimated r3")
abline(h = 11.58006, col = 2)
# Now plot initiation rate estimates
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["init_rate", col.names],
main = paste("PSJS init_rate, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS estimated init_rate")
abline(h = 22.58153 / tract, col = 2)
# Now plot tract p estimates
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
1.0/target_summary["tract_length", col.names],
main = paste("PSJS tract_p, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS estimated tract_p")
abline(h = 1.0 / tract, col = 2)
# Now plot Tau estimates
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["init_rate", col.names]*target_summary["tract_length", col.names],
main = paste("PSJS Tau, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS estimated Tau")
abline(h = 22.58153, col = 2)
plot(sim_info["num IGC", colnames(target_summary)[col.names]],
target_summary["init_rate", col.names]*target_summary["tract_length", col.names]*3.0/(1.0 +target_summary["r2", col.names] + target_summary["r3", col.names]),
main = paste("PSJS Weighted Tau, Tract = ", toString(tract), sep = ""),
xlab = "Num IGC", ylab = "PSJS Tau*3/(1+r2+r3)")
abline(h = 5.16376333125, col = 2)
# Now plot initiation rate vs tract length
plot(target_summary["tract_length", col.names],
target_summary["init_rate", col.names],
main = paste("PSJS init_rate, Tract = ", toString(tract), sep = ""),
xlab = "Tract length", ylab = "initiation rate")
lines(sort(target_summary["tract_length", col.names]), 22.58153/sort(target_summary["tract_length", col.names]),
col = "red", type = "l")
plot(sim_info["mean performed tract length", colnames(target_summary)[col.names]],
target_summary["tract_length", col.names],
main = paste(" mean performed vs PSJS estimated Tract ", toString(tract), sep = ""),
xlab = "mean performed IGC tract length", ylab = "PSJS estimated")
abline(a= 0.0, b = 1.0, col = "red")
plot(sim_info["mean proposed tract length", colnames(target_summary)[col.names]],
target_summary["tract_length", col.names],
main = paste(" mean proposed vs PSJS estimated Tract ", toString(tract), sep = ""),
xlab = "mean proposed IGC tract length", ylab = "PSJS estimated")
abline(a = 0.0, b = 1.0, col = 2)
plot(sim_info["mean performed nonidentical tract length", colnames(target_summary)[col.names]],
target_summary["tract_length", col.names],
main = paste(" mean performed nonidentical vs PSJS estimated Tract ", toString(tract), sep = ""),
xlab = "mean performed nonidentical IGC tract length", ylab = "PSJS estimated")
abline(a= 0.0, b = 1.0, col = "red")
print(paste("Tract = ", toString(tract), ".0 combined PSJS HKY Results", sep = ""))
print(matrix(c("Total samples", sum(col.names),
"mean", mean(target_summary["tract_length", col.names]),
"sd", sd(target_summary["tract_length", col.names])), 2, 3))
}
## [1] "Tract = 3.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "99" "3.35636357555957" "2.40699716566058"
## [1] "Tract = 10.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "100" "9.79420203503121" "7.17615180954532"
## [1] "Tract = 50.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "98" "50.3095661868595" "85.6445738100097"
## [1] "Tract = 100.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "100" "85.7367890914407" "122.076518037225"
## [1] "Tract = 200.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "100" "107.854006985987" "137.763222339365"
## [1] "Tract = 300.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "99" "172.173660004663" "222.141070349374"
## [1] "Tract = 400.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "99" "204.587166782426" "234.80346992204"
## [1] "Tract = 500.0 combined PSJS HKY Results"
## [,1] [,2] [,3]
## [1,] "Total samples" "mean" "sd"
## [2,] "99" "304.221267655645" "563.101225535658"
for(tract in Tract.list){
sim.info <- get(paste("sim.tract.", toString(tract), sep = ""))
PSJS.info <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
shared.col <- colnames(PSJS.info)[PSJS.info["tract_length", ] < 10000 & PSJS.info["tract_length", ] > 1.]
# Now show the ratio of PSJS estimated tract / actual mean tracts in simulation
PSJS.ratio <- PSJS.info["tract_length", shared.col]/sim.info[1, shared.col]
hist(PSJS.ratio, main = paste("PSJS ratio Tract = ", toString(tract), sep = ""))
print(matrix(c("Tract", tract, "PSJS mean", mean(PSJS.ratio), "PSJS sd", sd(PSJS.ratio)), 2, 3))
}
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "3" "0.0114764947112412" "0.0269021956964372"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "10" "0.0784491503718006" "0.0568692826532482"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "50" "2.48986487158233" "5.23413675636928"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "100" "6.51817135170871" "8.91221188549571"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "200" "14.3815208218789" "20.6537471861418"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "300" "37.8336862816968" "68.9900189948574"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "400" "44.4437409374203" "59.1296466255201"
## [,1] [,2] [,3]
## [1,] "Tract" "PSJS mean" "PSJS sd"
## [2,] "500" "79.7533127072861" "146.379973740167"
save workspace now
save.image("./SimulationStudy.RData")